{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e47899a3-0806-41d6-a71f-738e7ef9d8d3",
   "metadata": {},
   "source": [
    "# Introduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3c5250e-010c-4b4c-a5fa-43a3cc86df30",
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m pip install polars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a05aa96-ae34-41de-a7ef-1498e6d94cab",
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "\n",
    "tips = pl.scan_parquet(\"tips.parquet\")\n",
    "\n",
    "tips.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22c85bb2-8b10-4075-ab58-3b212f1ed050",
   "metadata": {},
   "outputs": [],
   "source": [
    "(tips.null_count()).collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c94a5e17-883a-4728-ac18-e4381b793182",
   "metadata": {},
   "source": [
    "# How to Work With Missing Data in Polars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11bc9817-6c80-492d-8846-48451e68fcb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "\n",
    "tips = pl.scan_parquet(\"tips.parquet\")\n",
    "\n",
    "(tips.filter(pl.col(\"total\").is_null() & pl.col(\"tip\").is_null())).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d79f6c04-cfcd-45e5-aa36-4a097d6e2082",
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    tips.drop_nulls(pl.col(\"total\")).filter(\n",
    "        pl.col(\"total\").is_null() & pl.col(\"tip\").is_null()\n",
    "    )\n",
    ").collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b7de256-b058-4b6d-b802-822019b0b7eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    tips.drop_nulls(pl.col(\"total\"))\n",
    "    .with_columns(pl.col(\"tip\").fill_null(0))\n",
    "    .filter(pl.col(\"tip\").is_null())\n",
    ").collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c628e41c-fc20-4a56-85ea-9ff631e8d614",
   "metadata": {},
   "source": [
    "# Using a More Strategic Approach"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10fd34e7-e94e-47f1-b9da-533b0550c9b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "\n",
    "tips = pl.scan_parquet(\"tips.parquet\")\n",
    "\n",
    "(tips.filter(pl.col(\"time\").is_null())).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a84196c9-5032-4650-83dd-176319b6eed5",
   "metadata": {},
   "outputs": [],
   "source": [
    "(tips.filter(pl.col(\"record_id\").is_in([2, 3, 4, 14, 15, 16]))).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "acfdafa7-c9e0-49cc-8b1e-e4366ce2ac59",
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    tips.drop_nulls(\"total\")\n",
    "    .with_columns(pl.col(\"tip\").fill_null(0))\n",
    "    .with_columns(pl.col(\"time\").fill_null(strategy=\"forward\"))\n",
    "    .filter(pl.col(\"record_id\").is_in([3, 15]))\n",
    ").collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c007132-c939-47b7-84b6-bf89c3da74a2",
   "metadata": {},
   "source": [
    "# Dealing With Nulls Across Multiple Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19504937-9a8b-48c9-b504-62db2bff178c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "\n",
    "tips = pl.scan_parquet(\"tips.parquet\")\n",
    "\n",
    "(tips.filter(pl.all_horizontal(pl.col(\"total\", \"tip\").is_null()))).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d5ba705-e675-4935-8aab-958a539bd66a",
   "metadata": {},
   "outputs": [],
   "source": [
    "tips = pl.scan_parquet(\"tips.parquet\")\n",
    "\n",
    "(tips.filter(~pl.all_horizontal(pl.col(\"total\", \"tip\").is_null()))).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29a6aab6-edb5-42cc-998b-7bd82f45ce8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "\n",
    "tips = pl.scan_parquet(\"tips.parquet\")\n",
    "\n",
    "(\n",
    "    tips.filter(~pl.all_horizontal(pl.col(\"total\", \"tip\").is_null()))\n",
    "    .with_columns(pl.col(\"tip\").fill_null(0))\n",
    "    .with_columns(pl.col(\"time\").fill_null(strategy=\"forward\"))\n",
    ").null_count().collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "32c00cbe-e300-4fd8-9a1e-f40371528fef",
   "metadata": {},
   "source": [
    "# Dealing With Nulls by Column Data Type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e29d50f-b9f8-4545-b954-040490e6f15c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "\n",
    "scientists = pl.LazyFrame(\n",
    "    {\n",
    "        \"scientist_id\": [1, 2, 3, 4, 5],\n",
    "        \"first_name\": [\"Isaac\", \"Louis\", None, \"Charles\", \"Marie\"],\n",
    "        \"last_name\": [None, \"Pasteur\", \"Einstein\", \"Darwin\", \"Curie\"],\n",
    "        \"birth_year\": [1642, 1822, None, 1809, 1867],\n",
    "        \"death_year\": [1726, 1895, 1955, None, 1934],\n",
    "    }\n",
    ")\n",
    "\n",
    "scientists.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6a5a990-d2cf-4dd2-8021-1a59e27c64d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars.selectors as cs\n",
    "\n",
    "(\n",
    "    scientists.with_columns(cs.string().fill_null(\"Unknown\")).with_columns(\n",
    "        cs.integer().fill_null(0)\n",
    "    )\n",
    ").collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f211113b-6988-4cf5-a0e9-c1c625b00148",
   "metadata": {},
   "source": [
    "# Dealing With Those Pesky NaNs and infs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b706a22-cc6a-49c9-858c-69bb3f72cb48",
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "\n",
    "sales_trends = pl.scan_csv(\"sales_trends.csv\")\n",
    "\n",
    "sales_trends.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5cde06c9-1a4c-45da-991d-cda5cd27542c",
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    sales_trends.with_columns(\n",
    "        pl.col(\"next_year\").replace(\n",
    "            [float(\"inf\"), -float(\"inf\"), float(\"NaN\")], None\n",
    "        )\n",
    "    )\n",
    ").collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "babf6ca8-101f-40f8-8224-426eeece5a81",
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    sales_trends.with_columns(\n",
    "        pl.col(\"next_year\").replace(\n",
    "            [float(\"inf\"), -float(\"inf\"), float(\"NaN\")], None\n",
    "        )\n",
    "    ).with_columns(\n",
    "        pl.col(\"next_year\").fill_null(\n",
    "            pl.col(\"current_year\")\n",
    "            + (pl.col(\"current_year\") - pl.col(\"last_year\"))\n",
    "        )\n",
    "    )\n",
    ").collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "903c4028-c3af-49ba-be08-e98afa785c09",
   "metadata": {},
   "source": [
    "#  Practicing Your Skills - Solution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d564123d-42da-462b-a52a-c6a815e59b0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "\n",
    "episodes = pl.scan_parquet(\"ft_exercise.parquet\")\n",
    "\n",
    "episodes.null_count().collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "000b53ba-c5d3-4a75-89d7-86c36881a078",
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "\n",
    "episodes = pl.scan_parquet(\"ft_exercise.parquet\")\n",
    "\n",
    "(\n",
    "    episodes.with_columns(\n",
    "        pl.when(pl.col(\"episode\") == 6)\n",
    "        .then(pl.col(\"series\").fill_null(strategy=\"forward\"))\n",
    "        .otherwise(pl.col(\"series\").fill_null(strategy=\"backward\"))\n",
    "    )\n",
    "    .with_columns(\n",
    "        pl.when(pl.col(\"episode\") == 4)\n",
    "        .then(pl.col(\"title\").fill_null(\"The Hotel Inspectors\"))\n",
    "        .otherwise(pl.col(\"title\").fill_null(\"Waldorf Salad\"))\n",
    "    )\n",
    "    .with_columns(pl.col(\"original_date\").interpolate())\n",
    ").null_count().collect()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
